# template elements
2018-02-13
Bobae Kang
(Bobae.Kang@illinois.gov)
Source: Iconfinder.com
Source: r-project.org
“R is a language and environment for statistical computing and graphics.” - The R Foundation
Benefits of using R
Source: Time Magazine
# peek at the first six rows of the data
head(ispcrime_tbl, n = 6)
# A tibble: 6 x 12
year county violentCrime murder rape robbery aggAssault propertyCrime
<int> <fct> <int> <int> <int> <int> <int> <int>
1 2011 Adams 218 0 37 15 166 1555
2 2011 Alexan~ 119 0 14 4 101 290
3 2011 Bond 6 1 0 0 5 211
4 2011 Boone 59 0 24 8 27 733
5 2011 Brown 7 0 1 0 6 38
6 2011 Bureau 42 0 4 3 35 505
# ... with 4 more variables: burglary <int>, larcenyTft <int>,
# MVTft <int>, arson <int>
# quick distribution summary (quartiles, mean, NA count) of the violent
# crime and property crime columns
summary(select(ispcrime_tbl, violentCrime, propertyCrime))
violentCrime propertyCrime
Min. : 0 Min. : 0
1st Qu.: 19 1st Qu.: 133
Median : 42 Median : 349
Mean : 501 Mean : 2913
3rd Qu.: 133 3rd Qu.: 1190
Max. :33348 Max. :178902
NA's :7 NA's :7
# keep the 2015 rows for counties whose name begins with "C", then derive
# and display a total crime count (violent + property) for each of them
ispcrime_tbl %>%
  filter(year == 2015, startsWith(as.character(county), "C")) %>%
  mutate(totalCrime = violentCrime + propertyCrime) %>%
  select(year, county, totalCrime)
# A tibble: 12 x 3
year county totalCrime
<int> <fct> <int>
1 2015 Calhoun NA
2 2015 Carroll 176
3 2015 Cass 154
4 2015 Champaign 6486
5 2015 Christian 292
6 2015 Clark 103
7 2015 Clay 191
8 2015 Clinton 423
9 2015 Coles 805
10 2015 Cook 153575
11 2015 Crawford 282
12 2015 Cumberland 42
# get the annual average count of total crime (violent + property) by county
# NOTE: with na.rm = TRUE a year whose count is missing adds nothing to the
# sum, while the divisor still counts every year present for the county
ispcrime_tbl %>%
  group_by(county) %>%
  summarise(
    # divide by the number of years in the data instead of a hard-coded 5
    annualAvgCrime = sum(violentCrime, propertyCrime, na.rm = TRUE) /
      n_distinct(year)
  )
# A tibble: 102 x 2
county annualAvgCrime
<fct> <dbl>
1 Adams 1724
2 Alexander 385
3 Bond 190
4 Boone 426
5 Brown 39.0
6 Bureau 480
7 Calhoun 13.8
8 Carroll 196
9 Cass 109
10 Champaign 6567
# ... with 92 more rows
# merge in the regions data, then count the number of rows in each region
# NOTE(review): left_join() is called without `by`, so dplyr joins on every
# column name the two tables share -- presumably just `county`; confirm and
# consider stating the key explicitly
# group_by() + count() keeps the result grouped by region
ispcrime_tbl %>%
left_join(regions) %>%
group_by(region) %>%
count()
# A tibble: 4 x 2
# Groups: region [4]
region n
<fct> <int>
1 Central 230
2 Cook 5
3 Northern 85
4 Southern 190
# bar plot of mean violent crime count by region
# (`fun` replaces the `fun.y` argument, which ggplot2 deprecated in 3.3.0)
ggplot(ispcrime_tbl2, aes(x = region, y = violentCrime, fill = region)) +
  stat_summary(geom = "bar", fun = "mean") +
  labs(title = "Violent crime count by region", x = "Region", y = "Count") +
  theme_classic(base_size = 15)
# line plot of the yearly violent crime total by region
# (`fun` replaces the `fun.y` argument, which ggplot2 deprecated in 3.3.0)
ggplot(ispcrime_tbl2, aes(x = year, y = violentCrime, color = region)) +
  stat_summary(geom = "line", fun = "sum", size = 1) +
  labs(title = "Violent crime trend by region", x = "Year", y = "Count") +
  theme_minimal(base_size = 15) +
  scale_color_brewer(palette = "Dark2")
# histograms of county-level violent crime counts, one panel per year,
# with Cook county left out
ispcrime_tbl2 %>%
  filter(county != "Cook") %>%
  ggplot(aes(x = violentCrime)) +
  geom_histogram(binwidth = 100) +
  facet_wrap(~ year) +
  labs(x = "Violent crime count", y = "Count") +
  theme_classic(base_size = 15)
# choropleth map of violent crime in 2015
# qtm() draws a quick thematic map of `counties`, filling each area by its
# "violentCrime" value, with the "World" format and no frame
# NOTE(review): presumably `counties` already holds only the 2015 counts --
# nothing in this call selects a year; confirm where `counties` is built
qtm(counties,
fill = "violentCrime",
format = "World",
frame = FALSE)
Other examples (1): Word cloud
Other examples (2): Parallel plot
Other examples (3): Network graph
Example - Simple linear model
# fit a simple linear regression of violent crime on property crime,
# then print the model summary
lm_fit <- lm(formula = violentCrime ~ propertyCrime, data = ispcrime)
summary(lm_fit)
Call:
lm(formula = violentCrime ~ propertyCrime, data = ispcrime)
Residuals:
Min 1Q Median 3Q Max
-2239.5 -2.2 57.0 78.3 3992.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -79.768287 16.496961 -4.835 1.77e-06 ***
propertyCrime 0.199367 0.001059 188.303 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 363.5 on 501 degrees of freedom
(7 observations deleted due to missingness)
Multiple R-squared: 0.9861, Adjusted R-squared: 0.986
F-statistic: 3.546e+04 on 1 and 501 DF, p-value: < 2.2e-16
# return the fitted model's coefficient table as a data frame
lm_fit %>%
  tidy()
term estimate std.error statistic p.value
1 (Intercept) -79.7682868 16.49696109 -4.835332 1.771126e-06
2 propertyCrime 0.1993675 0.00105876 188.302852 0.000000e+00
# attach the model's prediction (pred) and residual (resid) to every
# observation, then show the first six rows
ispcrime %>%
  select(year, county, propertyCrime, violentCrime) %>%
  add_predictions(model = lm_fit, var = "pred") %>%
  add_residuals(model = lm_fit, var = "resid") %>%
  head(n = 6)
year county propertyCrime violentCrime pred resid
1 2011 Adams 1555 218 230.24816 -12.248156
2 2011 Alexander 290 119 -21.95172 140.951715
3 2011 Bond 211 6 -37.70175 43.701747
4 2011 Boone 733 59 66.36808 -7.368081
5 2011 Brown 38 7 -72.19232 79.192322
6 2011 Bureau 505 42 20.91229 21.087706
# scatter plot of the data with the fitted regression line drawn on top
plot(violentCrime ~ propertyCrime, data = ispcrime)
abline(reg = lm_fit)
# show the model's diagnostic plots in a 2 x 2 grid
# par() returns the previous settings; restore them afterwards so that
# later plots are not left in the 2 x 2 layout
old_par <- par(mfrow = c(2, 2))
plot(lm_fit)
par(old_par)
Generalized linear models
# examples of generalized linear models with glm()
# NOTE(review): `mydata` and its columns (binary, count, y, x1, x2) look like
# placeholders rather than objects defined in this file -- confirm before running
# logistic regression: binomial family
logistic_reg <- glm(binary ~ x1 + x2, data = mydata, family = binomial())
# Poisson regression: poisson family
poisson_reg <- glm(count ~ x1 + x2, data = mydata, family = poisson())
# gamma regression: Gamma family
gamma_reg <- glm(y ~ x1 + x2, data = mydata, family = Gamma())
Other advanced models
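- Time series analysis (stats and forecast packages)
- Spatial analysis (spdep and spgwr packages)
- Survival analysis (survival package)
- Network analysis (network and igraph packages)
- Text analysis (tm and tidytext packages)
- Machine learning (caret and mlr packages)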